Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.

GO Term and KEGG Enrichment Analysis

Run GO Term enrichment analysis using results from DGE analysis (i.e., after running the 1a-DGE.Rmd script). Red text indicates regions that require the user to modify. Regardless, the user should check over all code blocks to ensure that everything is running correctly.

1. Load packages

Load packages

library(goseq)
library(tidyverse)
library(GSEABase)
library(data.table)
library(ggplot2)
library(cowplot)
library(patchwork)

2. Set variables

Change file names and conditions where appropriate.

# Input unfiltered data
treatmentinfo.file <- "salmon.numreads.DGE_samples.tsv"
dge_results.file <- "salmon.numreads.DGE_results.tsv"
annotation.file <- "Pocillopora_acuta_KBHIv2.pep.GOs.tsv"

# Output DEG results
GO_term_sig.file <- "salmon.numreads.DGE_results.GOsig.tsv"
KEGG_sig.file <- "salmon.numreads.DGE_results.KEGGsig.tsv"

# Cutoff for significant GO term p-values
GO_enrich_pvalue.cutoff <- 0.05
KEGG_enrich_pvalue.cutoff <- 0.05

# KEGG ID description file - provided with script
KEGG_IDs_Descriptions.file <- "1b-KEGG_IDs_Descriptions.tsv"

3. Load, clean, and pre-processing datasets

Load the input file containing the treatment information.

treatmentinfo <- read.csv(treatmentinfo.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM") # Read in file
head(treatmentinfo)
##              sample_id unit lib_type         fq1         fq2 species plug_id
## 1 Pacuta_ATAC_TP8_1051    1       pe SRR14610912 SRR14610912  Pacuta    1051
## 2 Pacuta_ATAC_TP8_1755    1       pe SRR14610911 SRR14610911  Pacuta    1755
## 3 Pacuta_ATAC_TP8_2012    1       pe SRR14610910 SRR14610910  Pacuta    2012
## 4 Pacuta_HTAC_TP8_1329    1       pe SRR14611019 SRR14611019  Pacuta    1329
## 5 Pacuta_HTAC_TP8_1765    1       pe SRR14611018 SRR14611018  Pacuta    1765
## 6 Pacuta_HTAC_TP8_2513    1       pe SRR14611017 SRR14611017  Pacuta    2513
##   treatment timepoint            reef ploidy  group
## 1      ATAC         8      Reef.11.13      3 Group1
## 2      ATAC         8      Reef.42.43      2 Group5
## 3      ATAC         8      Reef.42.43      3 Group2
## 4      HTAC         8 Lilipuna.Fringe      2 Group6
## 5      HTAC         8      Reef.42.43      3 Group3
## 6      HTAC         8         Reef.18      3 Group2
# Check we have the right column names
headers <- c("sample_id")
if( all(headers %in% colnames(treatmentinfo)) ){
  print(paste(treatmentinfo.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(treatmentinfo.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.DGE_samples.tsv has the required columns: sample_id"

Load the input file containing DGE results.

dge_results <- as.data.frame(read.csv(dge_results.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM")) # Read in file

# Check we have the right column names
headers <- c("gene_id")
if( all(headers %in% colnames(dge_results)) ){
  print(paste(dge_results.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(dge_results.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.DGE_results.tsv has the required columns: gene_id"

Load the input file containing gene annotations (GO terms, KEGG IDs, and gene/protein lengths).

# Annotations
annot <- read.csv(annotation.file, header=TRUE, sep="\t", fileEncoding="UTF-8-BOM")

# Check we have the right column names
headers <- c("gene_id", "GO_IDs", "KEGG_IDs", "length")
if( all(headers %in% colnames(annot)) ){
  paste(paste(annotation.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
  stop(paste(annotation.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "Pocillopora_acuta_KBHIv2.pep.GOs.tsv has the required columns: gene_id, GO_IDs, KEGG_IDs, length"
# Select only relevant information
Go.ref <- subset(annot, select= c(gene_id, length))

# Merge dge_results by available annotations
Go.ref <- merge(dge_results, Go.ref, by = "gene_id")
Go.ref <- unique(Go.ref)
dim(Go.ref)
## [1] 20  8

Set ID and gene length vectors, and make a binary matrix indicating which genes are differentially expressed. These are used as input to nullp, which for calculates a Probability Weighting Function for DEGs.

# Make ID and length vectors
IDvector <- annot$gene_id
lengthVector <- annot$length

# Get binary list indicating which genes are in DGE results and which are not out of all genes
dge.genes <- as.integer(annot$gene_id %in% Go.ref$gene_id)
names(dge.genes) <- annot$gene_id
print(paste("Number of DGE genes:     ", length(dge.genes[dge.genes == 1]), sep=''))
## [1] "Number of DGE genes:     20"
print(paste("Number of NON DGE genes: ", length(dge.genes[dge.genes == 0]), sep=''))
## [1] "Number of NON DGE genes: 24892"
# Weight vector by length of gene
pwf <- nullp(DEgenes=dge.genes, id=IDvector, bias.data=lengthVector)
## Warning in pcls(G): initial point very close to some inequality constraints

Prepare GO term dataframe.

# Cleanup GO terms and split multiple GO terms per line, into one GO term per line
GO.annot <- annot %>%
  subset(select=c(gene_id, GO_IDs)) %>%
  drop_na() %>% # Remove rows with missing GO terms or gene IDs
  filter(GO_IDs!="-") # Remove rows with missing values indicated by '-'
splitted <- strsplit(as.character(GO.annot$GO_IDs), "; ") # Split into multiple GO ids
GO.terms <- data.frame(v1 = rep.int(GO.annot$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their GO terms in a single row
colnames(GO.terms) <- c("gene_id", "GO.ID")

# Cleanup single-line GO term dataframe
GO.terms <- GO.terms %>%
  mutate(GO.ID = gsub(" ", "", GO.ID)) %>% # Remove spaces from GO terms - in case any exist by mistake
  mutate(GO.ID = as.character(GO.ID)) %>%
  mutate(GO.ID = factor(GO.ID, levels=unique(GO.ID))) %>%
  mutate(gene_id = factor(gene_id, levels=unique(gene_id))) %>%
  unique()

# Print stats
print(paste("No rows in 'GO.terms': ", dim(GO.terms)[1], sep=''))
## [1] "No rows in 'GO.terms': 1655777"
print(paste("Avg GO IDs per gene: ", nrow(GO.terms) / length(unique(GO.terms$gene_id)), sep=''))
## [1] "Avg GO IDs per gene: 142.26110490592"
head(GO.terms)
##                                        gene_id      GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001101
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001932
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001933
## 4 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002009
## 5 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002020
## 6 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002165

Find enriched GO terms, “selection-unbiased testing for category enrichment amongst significantly expressed genes for RNA-seq data”.

GOall <- goseq(pwf, GOref$gene_id, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns

Find only enriched GO terms that are statistically significant at cutoff.

GOall.filtered <- GOall %>% 
  filter(over_represented_pvalue < GO_enrich_pvalue.cutoff | under_represented_pvalue < GO_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
  arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
  mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
  mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff, 
                       "Over", 
                       if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff, 
                               "Under", 
                               "NULL"
                              )
                      )
        )

# Print stats
print(paste("Number GOs BEFORE sig filtering: ", nrow(GOall), sep=''))
## [1] "Number GOs BEFORE sig filtering: 21041"
print(paste("Number GOs AFTER sig filtering:  ", nrow(GOall.filtered), sep=''))
## [1] "Number GOs AFTER sig filtering:  81"
print(paste("Number GOs AFTER sig filtering OVER:  ", nrow(GOall.filtered %>% filter(over_represented_pvalue  < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering OVER:  81"
print(paste("Number GOs AFTER sig filtering UNDER: ", nrow(GOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering UNDER: 0"
print(paste("Number sig BP terms: ", nrow(filter(GOall.filtered, ontology=="BP")), sep=''))
## [1] "Number sig BP terms: 53"
print(paste("Number sig MF terms: ", nrow(filter(GOall.filtered, ontology=="MF")), sep=''))
## [1] "Number sig MF terms: 10"
print(paste("Number sig CC terms: ", nrow(filter(GOall.filtered, ontology=="CC")), sep=''))
## [1] "Number sig CC terms: 13"

Correct any un-annotated terms/ontologies.

NAs.ontology <- GOall.filtered %>% subset(is.na(term))
NAs.ontology
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 77 GO:0044214              0.01175946                0.9999387          1
## 78 GO:0089717              0.01175946                0.9999387          1
## 79 GO:0008329              0.01418905                0.9999095          1
## 80 GO:1901483              0.03456946                0.9994421          1
## 81 GO:1901485              0.03456946                0.9994421          1
##    numInCat term ontology  dir
## 77       14 <NA>     <NA> Over
## 78       14 <NA>     <NA> Over
## 79       17 <NA>     <NA> Over
## 80       34 <NA>     <NA> Over
## 81       34 <NA>     <NA> Over

Save significant terms.

write.table(GOall.filtered, file = GO_term_sig.file, row.names=F, quote=F, sep='\t')

4. Find GOslim terms

Run GOslim to get broader categories.

slim <- getOBOCollection("http://current.geneontology.org/ontology/subsets/goslim_generic.obo") # Get GO database

## BP
BP_GO <- GOall.filtered %>%
  filter(ontology=="BP")
BPGO_collection <- GOCollection(BP_GO$category) # Make library of query terms
slims_bp <- data.frame(goSlim(BPGO_collection, slim, "BP")) # Find common parent terms to slim down our list
slims_bp$category <- row.names(slims_bp) # Save rownames as category

## MF
MF_GO <- GOall.filtered %>%
  filter(ontology=="MF")
MFGO_collection <- GOCollection(MF_GO$category) # Make library of query terms
slims_mf <- data.frame(goSlim(MFGO_collection, slim, "MF")) # Find common parent terms to slim down our list
slims_mf$category <- row.names(slims_mf) # Save rownames as category

Get mapped terms, using functions from Sam White’s Biostars post.

# Write function mappedIds to get the query terms that mapped to the slim categories
mappedIds <-function(df, collection, OFFSPRING) { # The command to run requires a dataframe of slim terms, like slims_MF above, your list of query terms, and the offspring from the GOCollection by goSlim
  map <- as.list(OFFSPRING[rownames(df)]) # Subset GOcollection offspring by the rownames of your dataframe
  mapped <- lapply(map, intersect, ids(collection)) # Find the terms that intersect between the subset made above of your query terms and the GOids from the GO collection
  df[["go_terms"]] <- vapply(unname(mapped), paste, collapse = ";", character(1L)) # Add column "go_terms" with matching terms 
  df # Show resulting dataframe
}

# Run function for MF and BP terms
BPslim <- mappedIds(slims_bp, BPGO_collection, GOBPOFFSPRING)
MFslim <- mappedIds(slims_mf, MFGO_collection, GOMFOFFSPRING)

Remove duplicate matches, keeping the broader umbrella term

# BP
BPslim <- filter(BPslim, Count>0 & Term!="biological_process") # Filter out empty slims and term "biological process"
BPsplitted <- strsplit(as.character(BPslim$go_terms), ";") # Split into multiple GO ids
BPslimX <- data.frame(Term = rep.int(BPslim$Term, sapply(BPsplitted, length)), go_term = unlist(BPsplitted)) # List all
BPslimX <- merge(BPslimX, BPslim[,c(1,3:4)], by="Term") # Add back counts, term, and category info
BPslimX <- unique(setDT(BPslimX)[order(go_term, -Count)], by = "go_term") # Remove duplicate offspring terms, keeping only those that appear in the larger umbrella term (larger Count number)
BPslim <- data.frame(slim_term=BPslimX$Term, slim_cat=BPslimX$category, category=BPslimX$go_term) # Rename columns
head(BPslim)
##                slim_term   slim_cat   category
## 1  membrane organization GO:0061024 GO:0006911
## 2  immune system process GO:0002376 GO:0006955
## 3 chromatin organization GO:0006325 GO:0007549
## 4              signaling GO:0023052 GO:0007603
## 5              signaling GO:0023052 GO:0008377
## 6 chromatin organization GO:0006325 GO:0009047
# MF
MFslim <- filter(MFslim, Count>0 & Term!="molecular_function") # Filter out empty slims and term "molecular function"
MFsplitted <- strsplit(as.character(MFslim$go_terms), ";") # Split into multiple GO ids
MFslimX <- data.frame(Term = rep.int(MFslim$Term, sapply(MFsplitted, length)), go_term = unlist(MFsplitted)) # List all
MFslimX <- merge(MFslimX, MFslim[,c(1,3:4)], by="Term")  # Add back counts, term, and category info
MFslimX <- unique(setDT(MFslimX)[order(go_term, -Count)], by = "go_term")  # Remove duplicate offspring terms, keeping only
MFslim <- data.frame(slim_term=MFslimX$Term, slim_cat=MFslimX$category, category=MFslimX$go_term) # Rename columns
head(MFslim)
##                               slim_term   slim_cat   category
## 1                         lipid binding GO:0008289 GO:0001530
## 2               cargo receptor activity GO:0038024 GO:0005044
## 3                  transporter activity GO:0005215 GO:0010461
## 4 molecular function regulator activity GO:0098772 GO:0038177
## 5         molecular transducer activity GO:0060089 GO:0038187

Add back GO enrichment info for each offspring term.

GO.BP <- right_join(BPslim, filter(GOall.filtered, ontology=="BP"), by="category")
GO.MF <- right_join(MFslim, filter(GOall.filtered, ontology=="MF"), by="category")

5. Make heatmap

Plot heatmap of BP and MF GO slim terms.

term_label_text_size <- 6
slim_label_text_size <- 6

BPplot <- GO.BP %>%
  filter(numInCat>5) %>%
  mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
  ggplot(aes(x = dir, y = term)) + 
    geom_tile(aes(fill=over_represented_pvalue, width = 1)) + 
    facet_grid(slim_term ~ ontology, scales = "free_y", labeller = label_wrap_gen(width = 10, multi_line = TRUE))+
    theme_bw() +
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(colour = "black"),
          strip.text.y = element_text(angle=0, size = slim_label_text_size, face = "bold"),
          strip.text.x = element_text(size = 12, face = "bold"),
          axis.title.x = element_blank(),
          axis.title.y = element_text(size=15),
          axis.text = element_text(size = term_label_text_size), legend.position = "None",
          plot.margin = unit(c(0,1,0,0.25), "cm")
    ); BPplot

MFplot <- GO.MF %>%
  filter(numInCat>5) %>%
  mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
  ggplot(aes(x = dir, y = term)) + 
    geom_tile(aes(fill=over_represented_pvalue, width = 1)) + 
    scale_y_discrete(position = "right") +
    facet_grid(slim_term ~ ontology,
               scales = "free_y",
               labeller = label_wrap_gen(width = 10, multi_line = TRUE), 
               switch="y" # Put the y facet strips on the left
              ) +
    theme_bw() +
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(colour = "black"),
          strip.text.y.left = element_text(angle=0, size = slim_label_text_size, face = "bold"),
          strip.text.x = element_text(size = 12, face = "bold"),
          axis.title = element_blank(),
          axis.text = element_text(size = term_label_text_size),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 11)
    ); MFplot

Combined BP and MF plots.

BPplot + MFplot + plot_annotation(tag_levels = "A", tag_suffix = ")") & theme(plot.tag = element_text(size=15, face="bold"))

6. Kegg enrichment analysis

Select KEGG Orthogroup IDs (KOs) from annotation dataframe.

# Extract gene_ids and KO from annotation file
KO.terms <- annot %>%
  subset(select=c(gene_id, KEGG_IDs)) %>% # Select columns
  drop_na() %>% # Remove rows with missing GO terms or gene IDs
  filter(KEGG_IDs!="-") # Remove rows with missing values indicated by '-'

# Split multiple KEGG IDs per line into multiple lines
splitted <- strsplit(as.character(KO.terms$KEGG_IDs), "; ") # Split into multiple KEGG IDs
KO.terms <- data.frame(v1 = rep.int(KO.terms$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their KEGG IDs in a single row
colnames(KO.terms) <- c("gene_id", "KEGG_IDs")

KO.terms <- unique(KO.terms)
colnames(KO.terms) <- c("gene_id", "GO.ID")
head(KO.terms)
##                                        gene_id  GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1a K00549
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b K16866
## 3  Pocillopora_acuta_KBHIv2___RNAseq.g18333.t1 K03386
## 4   Pocillopora_acuta_KBHIv2___RNAseq.g7985.t1 K12418
## 5      Pocillopora_acuta_KBHIv2___TS.g15308.t1 K13755
## 6   Pocillopora_acuta_KBHIv2___RNAseq.g2057.t1 K11001
# Bind KO and GO references
GOKO.terms <- bind_rows(GO.terms, KO.terms)

Perform Kegg enrichment with goseq package.

KOall <- goseq(pwf, GOref$gene_id, gene2cat=GOKO.terms, test.cats=c("KEGG"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns

Extract significantly enriched KEGG terms.

KOall.filtered <- KOall %>% 
  filter(over_represented_pvalue < KEGG_enrich_pvalue.cutoff | under_represented_pvalue < KEGG_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
  arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
  mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
  mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff, 
                       "Over", 
                       if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff, 
                               "Under", 
                               "NULL"
                              )
                      )
        ) %>%
  mutate(ontology = replace_na(ontology, "KEGG")) %>%
  filter(ontology=="KEGG")

# Print stats
print(paste("Number KEGG IDs BEFORE sig filtering: ", nrow(KOall), sep=''))
## [1] "Number KEGG IDs BEFORE sig filtering: 28185"
print(paste("Number KEGG IDs AFTER sig filtering:  ", nrow(KOall.filtered), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering:  17"
print(paste("Number KEGG IDs AFTER sig filtering OVER:  ", nrow(KOall.filtered %>% filter(over_represented_pvalue  < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering OVER:  17"
print(paste("Number KEGG IDs AFTER sig filtering UNDER: ", nrow(KOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering UNDER: 0"

Add KO definitions.

# Load definition file
KEGG_IDs_Descriptions <- read.table(KEGG_IDs_Descriptions.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM", quote='') # Read in file

# Prep definition data
KEGG_IDs_Descriptions <- KEGG_IDs_Descriptions %>%
  subset(select=c("D.ID", "D.Description")) %>%
  unique()
colnames(KEGG_IDs_Descriptions) <- c("category", "description")

# Merge with KEGG output
KOall.filtered.descriptions <- unique(left_join(KOall.filtered, KEGG_IDs_Descriptions, by=c("category")))

Write output KEGG enrichment files.

write.table(KOall.filtered.descriptions, file = KEGG_sig.file, row.names=F, quote=F, sep='\t')